In [1]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import os
import random

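If this is a fresh NLTK install, word_tokenize (and any stop-word filtering) relies on NLTK's data packages. A minimal sketch of the downloads, assuming they haven't been fetched already:

import nltk
nltk.download("punkt")       # tokenizer models used by word_tokenize
nltk.download("stopwords")   # only needed if you choose to filter out stop words
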
In [2]:
rootdir = "C:\\Users\\Shantnu\\Desktop\\Data Sources\\Enron Spam"

In [3]:
# Loop through all the directories, subdirectories and files under the folder above, and print them.

# For the files, print only how many there are.

for directories, subdirs, files in os.walk(rootdir):
    print(directories, subdirs, len(files))


C:\Users\Shantnu\Desktop\Data Sources\Enron Spam ['enron1', 'enron2', 'enron3', 'enron4', 'enron5', 'enron6'] 0
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1\ham [] 3672
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1\spam [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron2 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron2\ham [] 4361
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron2\spam [] 1496
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron3 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron3\ham [] 4012
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron3\spam [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron4 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron4\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron4\spam [] 4500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron5 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron5\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron5\spam [] 3675
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron6 ['ham', 'spam'] 1
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron6\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron6\spam [] 4500

In [4]:
# os.path.split() splits a path into (everything before the last separator, the last component).
print(os.path.split("C:\\Users\\Shantnu\\Desktop\\Data Sources\\Enron Spam\\enron1\\ham"))
print(os.path.split("C:\\Users\\Shantnu\\Desktop\\Data Sources\\Enron Spam\\enron1\\ham")[0])
print(os.path.split("C:\\Users\\Shantnu\\Desktop\\Data Sources\\Enron Spam\\enron1\\ham")[1])


('C:\\Users\\Shantnu\\Desktop\\Data Sources\\Enron Spam\\enron1', 'ham')
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1
ham

In [5]:
# Same as before, but only print the ham and spam folders
for directories, subdirs, files in os.walk(rootdir):
    if (os.path.split(directories)[1]  == 'ham'):
        print(directories, subdirs, len(files))
    
    if (os.path.split(directories)[1]  == 'spam'):
        print(directories, subdirs, len(files))


C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1\ham [] 3672
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron1\spam [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron2\ham [] 4361
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron2\spam [] 1496
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron3\ham [] 4012
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron3\spam [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron4\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron4\spam [] 4500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron5\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron5\spam [] 3675
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron6\ham [] 1500
C:\Users\Shantnu\Desktop\Data Sources\Enron Spam\enron6\spam [] 4500

In [6]:
ham_list = []
spam_list = []

# Same as before, but this time read each file and append its contents to the ham or spam list
for directories, subdirs, files in os.walk(rootdir):
    if (os.path.split(directories)[1]  == 'ham'):
        for filename in files:      
            with open(os.path.join(directories, filename), encoding="latin-1") as f:
                data = f.read()
                ham_list.append(data)
    
    if (os.path.split(directories)[1]  == 'spam'):
        for filename in files:
            with open(os.path.join(directories, filename), encoding="latin-1") as f:
                data = f.read()
                spam_list.append(data)


print(ham_list[0])
print(spam_list[0])


Subject: christmas tree farm pictures

Subject: dobmeos with hgh my energy level has gone up ! stukm
introducing
doctor - formulated
hgh
human growth hormone - also called hgh
is referred to in medical science as the master hormone . it is very plentiful
when we are young , but near the age of twenty - one our bodies begin to produce
less of it . by the time we are forty nearly everyone is deficient in hgh ,
and at eighty our production has normally diminished at least 90 - 95 % .
advantages of hgh :
- increased muscle strength
- loss in body fat
- increased bone density
- lower blood pressure
- quickens wound healing
- reduces cellulite
- improved vision
- wrinkle disappearance
- increased skin thickness texture
- increased energy levels
- improved sleep and emotional stability
- improved memory and mental alertness
- increased sexual potency
- resistance to common illness
- strengthened heart muscle
- controlled cholesterol
- controlled mood swings
- new hair growth and color restore
read
more at this website
unsubscribe


In [7]:
# Write a function that, when passed a list of words, will return a dictionary of the form

# {Word1: True, Word2: True, Word3: True}

# Removing stop words is optional (a variant is sketched after this cell)

def create_word_features(words):
    my_dict = dict( [ (word, True) for word in words] )
    return my_dict

create_word_features(["the", "quick", "brown", "quick", "a", "fox"])


Out[7]:
{'a': True, 'brown': True, 'fox': True, 'quick': True, 'the': True}
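
Since the comment above notes that removing stop words is optional, here is a sketch of a variant that drops them, using the stopwords corpus imported at the top (the function name is my own and it is not used in the rest of the notebook):

def create_word_features_no_stop(words):
    # Same {word: True} dictionary, but skip common English stop words
    stops = set(stopwords.words("english"))
    return dict((word, True) for word in words if word.lower() not in stops)

create_word_features_no_stop(["the", "quick", "brown", "quick", "a", "fox"])
# -> {'quick': True, 'brown': True, 'fox': True}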

In [8]:
ham_list = []
spam_list = []

# Same as before, but this time:

# 1. Break the sentences into words using word_tokenize
# 2. Use the create_word_features() function you just wrote
for directories, subdirs, files in os.walk(rootdir):
    if (os.path.split(directories)[1]  == 'ham'):
        for filename in files:      
            with open(os.path.join(directories, filename), encoding="latin-1") as f:
                data = f.read()
                
                # The data we read is one big string. We need to break it into words.
                words = word_tokenize(data)
                
                ham_list.append((create_word_features(words), "ham"))
    
    if (os.path.split(directories)[1]  == 'spam'):
        for filename in files:
            with open(os.path.join(directories, filename), encoding="latin-1") as f:
                data = f.read()
                
                # The data we read is one big string. We need to break it into words.
                words = word_tokenize(data)
                
                spam_list.append((create_word_features(words), "spam"))
print(ham_list[0])
print(spam_list[0])


({'Subject': True, 'pictures': True, ':': True, 'farm': True, 'christmas': True, 'tree': True}, 'ham')
({'production': True, 'read': True, 'is': True, 'sexual': True, 'improved': True, 'age': True, 'strength': True, '.': True, 'medical': True, 'healing': True, 'website': True, 'introducing': True, 'hormone': True, 'color': True, 'master': True, 'controlled': True, 'levels': True, 'restore': True, 'as': True, 'cellulite': True, '-': True, 'least': True, 'energy': True, 'bone': True, 'near': True, '90': True, 'diminished': True, 'eighty': True, '!': True, 'mood': True, 'our': True, 'level': True, ',': True, 'twenty': True, 'to': True, 'nearly': True, 'time': True, 'human': True, '%': True, 'this': True, 'called': True, 'cholesterol': True, 'referred': True, 'normally': True, 'illness': True, 'emotional': True, 'one': True, 'of': True, 'stukm': True, 'dobmeos': True, 'bodies': True, 'hgh': True, 'wrinkle': True, 'but': True, 'deficient': True, 'skin': True, 'texture': True, 'memory': True, 'pressure': True, 'quickens': True, 'muscle': True, 'produce': True, 'lower': True, 'body': True, 'advantages': True, 'heart': True, 'unsubscribe': True, 'begin': True, 'in': True, 'Subject': True, 'resistance': True, 'strengthened': True, 'thickness': True, 'by': True, 'growth': True, 'stability': True, 'increased': True, 'density': True, 'disappearance': True, 'loss': True, 'and': True, 'everyone': True, 'new': True, 'young': True, 'are': True, 'vision': True, 'blood': True, 'hair': True, 'very': True, 'at': True, 'sleep': True, 'forty': True, 'doctor': True, 'the': True, 'when': True, 'gone': True, 'mental': True, ':': True, 'alertness': True, 'up': True, 'wound': True, 'potency': True, 'it': True, 'also': True, 'more': True, 'formulated': True, 'common': True, 'my': True, 'has': True, 'science': True, '95': True, 'plentiful': True, 'fat': True, 'reduces': True, 'we': True, 'with': True, 'swings': True, 'less': True}, 'spam')

In [9]:
# Combine the ham and spam lists, then shuffle so the two classes are mixed
# before we split the data into training and test sets.
combined_list = ham_list + spam_list
print(len(combined_list))

random.shuffle(combined_list)


33716

In [12]:
# Split the data into training and test sets.

# 70% of the data is for training; 30% is for testing.

training_part = int(len(combined_list) * .7)

print(len(combined_list))

training_set = combined_list[:training_part]

test_set =  combined_list[training_part:]

print (len(training_set))
print (len(test_set))


33716
23601
10115

In [13]:
# Create the Naive Bayes filter

classifier = NaiveBayesClassifier.train(training_set)

# Find the accuracy, using the test data

accuracy = nltk.classify.util.accuracy(classifier, test_set)

print("Accuracy is: ", accuracy * 100)


Accuracy is:  98.50716757291151
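
Accuracy alone doesn't show where the mistakes fall. As a rough sketch (not part of the original run; the variable name is made up), you can count how each labelled test message gets classified:

from collections import Counter

prediction_counts = Counter()
for features, label in test_set:
    # Count (actual label, predicted label) pairs; ('spam', 'ham') would be a missed spam
    prediction_counts[(label, classifier.classify(features))] += 1

print(prediction_counts)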

In [14]:
# Show the 20 words that most strongly separate ham from spam, along with how much
# more likely each word is to appear in one class than the other.
classifier.show_most_informative_features(20)


Most Informative Features
                   enron = True              ham : spam   =   3588.6 : 1.0
                     hpl = True              ham : spam   =    577.0 : 1.0
                     php = True             spam : ham    =    416.2 : 1.0
                     713 = True              ham : spam   =    326.3 : 1.0
                  louise = True              ham : spam   =    299.2 : 1.0
                     xls = True              ham : spam   =    281.8 : 1.0
                 stinson = True              ham : spam   =    267.4 : 1.0
                crenshaw = True              ham : spam   =    251.5 : 1.0
                     ect = True              ham : spam   =    231.2 : 1.0
                   corel = True             spam : ham    =    220.5 : 1.0
              macromedia = True             spam : ham    =    210.9 : 1.0
              scheduling = True              ham : spam   =    209.6 : 1.0
                        = True              ham : spam   =    184.1 : 1.0
                     sex = True             spam : ham    =    182.3 : 1.0
                      xp = True             spam : ham    =    172.6 : 1.0
                   daren = True              ham : spam   =    168.7 : 1.0
                    1933 = True             spam : ham    =    152.1 : 1.0
                    spam = True             spam : ham    =    145.1 : 1.0
                 parsing = True              ham : spam   =    137.6 : 1.0
                   penis = True             spam : ham    =    117.2 : 1.0

In [15]:
# Classify the messages below as spam or ham

# Hint: 1. Break them into words using word_tokenize
# 2. Pass the words to create_word_features()
# 3. Use the classifier's classify() function

msg1 = '''Hello th̓ere seُx master :-)
i need c0ck ri͏ght noِw ..͏. don't tell my hǔbbٚy.ٚ. ))
My sc͕rٞeٚe̻nname is Dorry.
My accֺo֔unt is h֯ere: http:nxusxbnd.GirlsBadoo.ru
C u late٘r!'''


msg2 = '''As one of our top customers we are providing 10% OFF the total of your next used book purchase from www.letthestoriesliveon.com. Please use the promotional code, TOPTENOFF at checkout. Limited to 1 use per customer. All books have free shipping within the contiguous 48 United States and there is no minimum purchase.

We have millions of used books in stock that are up to 90% off MRSP and add tens of thousands of new items every day. Don’t forget to check back frequently for new arrivals.'''



msg3 = '''To start off, I have a 6 new videos + transcripts in the members section. In it, we analyse the Enron email dataset, half a million files, spread over 2.5GB. It's about 1.5 hours of  video.

I have also created a Conda environment for running the code (both free and member lessons). This is to ensure everyone is running the same version of libraries, preventing the Works on my machine problems. If you get a second, do you mind trying it here?'''

In [16]:
words = word_tokenize(msg1)
features = create_word_features(words)
print("Message 1 is :" ,classifier.classify(features))


Message 1 is : spam

In [18]:
words = word_tokenize(msg2)
features = create_word_features(words)
print("Message 2 is :" ,classifier.classify(features))


Message 2 is : spam

In [19]:
words = word_tokenize(msg3)
features = create_word_features(words)
print("Message 3 is :" ,classifier.classify(features))


Message 3 is : ham
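
For convenience, the same three steps can be wrapped into a small helper for any new message (a sketch; the function name is made up):

def classify_message(msg):
    # Tokenize, build the {word: True} features, then ask the trained classifier
    words = word_tokenize(msg)
    features = create_word_features(words)
    return classifier.classify(features)

print(classify_message(msg1))   # spam, as above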

In [ ]: